

# Setup ----------------------------------------------------------------------------------
  # Load packages
  library(pacman)
  p_load(fastverse, fixest, latex2exp, showtext, here)
  fastverse_extend(topics = c('DT', 'ST', 'VI'))
  # Load fonts
  showtext_auto()
  font_add(family = 'Charter', regular = '/System/Library/Fonts/Supplemental/Charter.ttc')
  # Define colors
  dark_grey    = "#7D7D7D"
  light_grey   = "#C7C7C7"
  v_light_grey = "#EEEEEE"
  mid_grey     = "#A2A2A2"
  purple       = "#6A1B9A"
  red_pink     = "#E91E63"
  orange       = "#FFA000"


# Load data: Zipcode income --------------------------------------------------------------
  # Load the dataset
  income_dt = here(
    'DataRaw', 'ZipIncome',
    'Personal_Income_Tax_Statistics_By_Zip_Code.csv'
  ) %>% fread()
  # Clean names
  setnames(income_dt, income_dt[0,] %>% janitor::clean_names() %>% names())
  # Grab desired columns
  income_dt %<>% .[, .(
    year = as.integer(taxable_year),
    zip = as.integer(zip_code),
    income = ca_agi / 1e3,
    n_returns = returns
  )]


# Load data: CARE summaries --------------------------------------------------------------
  # Load PG&E dataset
  pge_dt = here(
    'DataR', 'Summaries',
    'zip5PGE.rds'
  ) %>% readRDS() %T>% setDT()  
  # Load SoCal dataset
  socal_dt = here(
    'DataR', 'Summaries',
    'zip5SoCal.rds'
  ) %>% readRDS() %T>% setDT()


# Data work ------------------------------------------------------------------------------
  # Add year variable
  pge_dt[, year := str_sub(ym, 1, 4)]
  socal_dt[, year := str_sub(ym, 1, 4)]
  # Drop observations missing CARE percent (44 zips in SoCal)
  pge_dt %<>% .[!is.na(care_pct)]
  socal_dt %<>% .[!is.na(care_pct)]
  # Focus on 2008-2014
  pge_dt %<>% .[between(year, 2008, 2014)]
  socal_dt %<>% .[between(year, 2008, 2014)]
  # Find zip-year mean CARE percent
  pge_dt %<>% .[, .(
    care_pct = fmean(care_pct, w = hh_count),
    n_obs = fsum(hh_count),
    utility = ffirst(utility)
  ), by = .(zip = zip5, year)]
  socal_dt %<>% .[, .(
    care_pct = fmean(care_pct, w = hh_count),
    n_obs = fsum(hh_count),
    utility = ffirst(utility)
  ), by = .(zip = zip5, year)]
  # Stack datasets
  full_dt = rbindlist(list(
    pge_dt,
    socal_dt
  ))
  # Collapse to zip-year (currently also has utility)
  full_dt %<>% .[, .(
    care_pct = fmean(care_pct, w = n_obs),
    n_obs = fsum(n_obs),
    utilities = funique(utility) %>% sort() %>% paste0(collapse = ',')
  ), by = .(zip, year)]
  # To integer
  full_dt[, `:=`(
    year = year %>% as.integer(),
    zip = zip %>% as.integer()
  )]
  # Merge with income data
  full_dt %<>% merge(
    y = income_dt,
    by = c('year', 'zip'),
    all.x = TRUE,
    all.y = FALSE
  )


# Analysis -------------------------------------------------------------------------------
  # Correlation of per capita income and percent CARE
  feols(
    income / n_returns ~ care_pct,
    data = full_dt
  ) %>% etable(vcov = 'hetero')  
  # Add year fixed effects
  feols(
    income / n_returns ~ care_pct | year,
    data = full_dt
  ) %>% etable(vcov = 'hetero') 
  # Weight by number of observations
  feols(
    income / n_returns ~ care_pct | year,
    data = full_dt,
    weights = ~n_obs
  ) %>% etable(vcov = 'hetero') 
  # Weight by number of tax returns
  feols(
    income / n_returns ~ care_pct | year,
    data = full_dt,
    weights = ~n_returns
  ) %>% etable(vcov = 'hetero') 
  # Estimate separately for each year (weighting by number of observations)
  feols(
    income / n_returns ~ care_pct | year,
    data = full_dt,
    weights = ~n_obs,
    fsplit = ~year
  ) %>% etable(vcov = 'hetero') 


# Figure ---------------------------------------------------------------------------------
  # Collapse dataset
  agg_dt = full_dt[, .(
    income = fmean(income, w = n_obs),
    n_returns = fmean(n_returns, w = n_obs),
    care_pct = fmean(care_pct, w = n_obs),
    n_obs = fsum(n_obs),
    any_pge = str_detect(utilities, 'pge') %>% fmax(),
    any_socal = str_detect(utilities, 'socal') %>% fmax()
  ), by = zip]
  # Variable for utilites
  agg_dt[, `:=`(
    utility = fcase(
      any_pge == 1 & any_socal == 0, 'PG&E',
      any_pge == 0 & any_socal == 1, 'SoCalGas',
      any_pge == 1 & any_socal == 1, 'Both'
    )
  )]

  # Plot
  gg = ggplot(
    data = agg_dt,
    aes(
      x = income * 1e3 / n_returns,
      y = care_pct,
      color = utility,
      size = utility
    )
  ) +
  geom_point(alpha = 0.8, stroke = 0) +
  scale_x_log10('Income per capita (log scale)', labels = scales::dollar) +
  scale_y_continuous('Percent of bills with CARE', labels = scales::percent) +
  scale_color_manual('Utility', values = c('#e40045', '#ffc107', '#310769')) +
  scale_size_manual('Utility', values = c(3.5, 1.5, 1.5)) +
  # labs(
  #   title = TeX(paste(
  #     '\\textbf{Income and CARE status:}',
  #     'Zip-code income \\textit{per capita} and share of bills with CARE'
  #   )),
  #   subtitle = 'Zip-code averages 2008–2014 for 1,190 zip codes served by PG&E and/or SoCalGas',
  #   # caption = TeX(paste(
  #   #   '\\textit{Notes:}',
  #   #   'CARE/utility data come from the respective utilities; income data come from California Franchise Tax Board'
  #   # ))
  # ) +
  theme_minimal(base_family = 'Charter', base_size = 12) +
  theme(legend.position = 'bottom')
  # Save the plot
  cowplot::save_plot(
    plot = gg,
    filename = here('Output', 'zip-care-income.pdf'),
    # filename = here('Output', 'zip-care-income.png'),
    base_width = 6.5,
    base_height = 4.5
  )
